To what degree is it possible to predict the number of runs a baseball team will score in a game using known statistical data available before the start of the game?
A final model containing all features: the training set is composed of game-by-game WHIP, DER and OPS values, while the test set uses season-to-date average WHIP, DER and OPS.
### load packages
# install.packages("FNN")
# install.packages("rpart")
# install.packages("rpart.plot")
# install.packages("rattle")
# install.packages("ggplot2")
# install.packages("knitr")
# install.packages("kableExtra")
# install.packages("gridExtra")
# install.packages("tidyverse")
# install.packages("dplyr")
# install.packages('Boruta')
### set working directory
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("/Users/geoffcooper/Dropbox/CKME136/ckme136/FinalResults_20200727")
wd <- getwd()
### open tor_games raw data
# NOTE(review): path is machine-specific; the CSV must sit in the directory above.
datafile <- file.path(wd, "tor_games_20200727.csv")
tor_games <- read.csv(datafile)
head(tor_games)
# Training set: the response (tor_runs) plus game-by-game feature values.
# Columns carry a "_game" suffix in the raw data; strip it after selecting.
batters <- c("drurb001", "bichb001", "biggc002", "davij007", "fishd001",
             "galvf001", "gricr001", "guerv002", "hernt002", "gurrl001",
             "jansd001", "maill001", "mcgur002", "mckib001", "pillk001",
             "smoaj001", "sogae001", "tellr001")
training_cols <- c("tor_runs", "game_park_factor", "tor_home",
                   "opp_pitcher_whip_game", "opp_team_der_game",
                   paste0(batters, "_ops_game"))
tor_games_training <- tor_games[, training_cols]
names(tor_games_training) <- sub("_game$", "", training_cols)
head(tor_games_training)
library(dplyr)
# Test set: the same features as the training set, but as season-to-date
# averages ("_todate" columns). Games on/before 2019-04-09 are dropped so
# each average is based on at least a few prior games; game_date itself is
# only needed for that filter and is removed afterwards.
batters <- c("drurb001", "bichb001", "biggc002", "davij007", "fishd001",
             "galvf001", "gricr001", "guerv002", "hernt002", "gurrl001",
             "jansd001", "maill001", "mcgur002", "mckib001", "pillk001",
             "smoaj001", "sogae001", "tellr001")
test_cols <- c("game_date", "tor_runs", "game_park_factor", "tor_home",
               "opp_pitcher_whip_todate", "opp_team_der_todate",
               paste0(batters, "_ops_todate"))
tor_games_test <- tor_games[, test_cols]
names(tor_games_test) <- sub("_todate$", "", test_cols)
# game_date is stored as a yyyymmdd integer, so a numeric comparison works.
tor_games_test <- tor_games_test %>% filter(game_date > 20190409)
tor_games_test$game_date <- NULL
head(tor_games_test)
library(ggplot2)
# Distribution of the response variable (runs scored per game) over the 2019
# season, with the season mean marked by the dashed red line.
ggplot(tor_games_training, aes(x = tor_runs, y = ..count..)) +
  geom_histogram(binwidth = 1, alpha = 0.5, color = "darkblue", fill = "lightblue") +
  geom_density(alpha = 0, color = "darkblue") +
  labs(title = "Histogram of Toronto Blue Jays runs scored (y) - 2019 season") +
  geom_vline(aes(xintercept = mean(tor_runs)),
             color = "red", linetype = "dashed", size = 0.5) +
  geom_text(aes(x = mean(tor_runs), label = round(mean(tor_runs), 2), y = 0),
            colour = "red", angle = 0, vjust = -1) +
  theme_classic()
# Use library() rather than require() for loading: require() only returns
# FALSE on failure instead of stopping the script.
library(gridExtra)

# Build one histogram + density panel for a single feature column.
#
# data     : tor_games_training or tor_games_test
# var      : feature column name as a string
# binwidth : histogram bin width; also used to rescale the density curve so it
#            overlays the raw counts (each original call used the same value
#            for both purposes)
# x_limits / x_breaks / y_limits / y_breaks : axis specifications
# title    : plot title (may contain "\n")
# The season mean of the feature is marked with a dashed red line and label.
feature_hist <- function(data, var, binwidth, x_limits, x_breaks,
                         y_limits, y_breaks, title) {
  ggplot(data, aes(x = .data[[var]], y = ..count..)) +
    geom_histogram(binwidth = binwidth, alpha = 0.5,
                   color = "darkblue", fill = "lightblue") +
    scale_x_continuous(limits = x_limits, breaks = x_breaks) +
    scale_y_continuous(limits = y_limits, breaks = y_breaks) +
    geom_density(aes(y = binwidth * ..count..), alpha = 0, color = "darkblue") +
    labs(title = title) +
    geom_vline(aes(xintercept = mean(.data[[var]])),
               color = "red", linetype = "dashed", size = 0.5) +
    geom_text(aes(x = mean(.data[[var]]),
                  label = round(mean(.data[[var]]), 2), y = 0),
              colour = "red", angle = 0, vjust = -1) +
    theme_classic()
}

# Opposition pitcher WHIP (x3): game-by-game vs season-to-date.
whip_training <- feature_hist(tor_games_training, "opp_pitcher_whip", 0.25,
                              c(-0.5, 4.5), seq(0, 4, 0.5),
                              c(0, 60), seq(0, 50, 10),
                              "Histogram of Opposition pitcher\nWHIP (x3) - Training set (by game)")
whip_test <- feature_hist(tor_games_test, "opp_pitcher_whip", 0.25,
                          c(-0.5, 4.5), seq(0, 4, 0.5),
                          c(0, 60), seq(0, 50, 10),
                          "Histogram of Opposition pitcher\nWHIP (x3) - Test set (average to date)")
grid.arrange(whip_training, whip_test, ncol = 2)

# Opposition team DER (x4). The to-date averages are far less spread out than
# the per-game values, hence the much narrower bin width on the test panel.
der_training <- feature_hist(tor_games_training, "opp_team_der", 0.1,
                             c(0.4, 1.1), seq(0.5, 1, 0.1),
                             c(0, 90), seq(0, 70, 10),
                             "Histogram of Opposition team DER\n(x4) - Training set (by game)")
der_test <- feature_hist(tor_games_test, "opp_team_der", 0.02,
                         c(0.4, 1.1), seq(0.5, 1, 0.1),
                         c(0, 90), seq(0, 70, 10),
                         "Histogram of Opposition team DER\n(x4) - Test set (average to date)")
grid.arrange(der_training, der_test, ncol = 2)

# Cavan Biggio OPS (x7).
biggc002_training <- feature_hist(tor_games_training, "biggc002_ops", 0.25,
                                  c(-0.25, 3), seq(0, 3, 0.5),
                                  c(0, 90), seq(0, 90, 10),
                                  "Histogram of Cavan Biggio OPS\n(x7) - Training set (by game)")
biggc002_test <- feature_hist(tor_games_test, "biggc002_ops", 0.25,
                              c(-0.25, 3), seq(0, 3, 0.5),
                              c(0, 90), seq(0, 90, 10),
                              "Histogram of Cavan Biggio OPS\n(x7) - Test set (average to date)")
grid.arrange(biggc002_training, biggc002_test, ncol = 2)

# Randal Grichuk OPS (x11).
gricr001_training <- feature_hist(tor_games_training, "gricr001_ops", 0.25,
                                  c(-0.25, 3), seq(0, 3, 0.5),
                                  c(0, 540), seq(0, 160, 40),
                                  "Histogram of Randal Grichuk OPS\n(x11) - Training set (by game)")
gricr001_test <- feature_hist(tor_games_test, "gricr001_ops", 0.25,
                              c(-0.25, 3), seq(0, 3, 0.5),
                              c(0, 540), seq(0, 160, 40),
                              "Histogram of Randal Grichuk OPS\n(x11) - Test set (average to date)")
grid.arrange(gricr001_training, gricr001_test, ncol = 2)

# Vladimir Guerrero Jr OPS (x12).
guerv002_training <- feature_hist(tor_games_training, "guerv002_ops", 0.25,
                                  c(-0.25, 3), seq(0, 3, 0.5),
                                  c(0, 130), seq(0, 120, 20),
                                  "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Training set (by game)")
guerv002_test <- feature_hist(tor_games_test, "guerv002_ops", 0.25,
                              c(-0.25, 3), seq(0, 3, 0.5),
                              c(0, 130), seq(0, 120, 20),
                              "Histogram of Vladimir Guerrero Jr\nOPS (x12) - Test set (average to date)")
grid.arrange(guerv002_training, guerv002_test, ncol = 2)
# Empty result tables; one row is appended per evaluated model, once for the
# training-set runs and once for the test-set runs.
models_compare_training <- data.frame(
  Model = character(0),
  Correlation_r = double(0),
  Mean_squared_prediction_error = double(0),
  Error_distribution = character(0)
)
models_compare_test <- models_compare_training
# Ordinary least squares: runs scored regressed on all 22 features
# (park factor, home indicator, opposition WHIP/DER, and 18 batter OPS values).
model_linreg <- lm(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  data = tor_games_training
)
summary(model_linreg)
##
## Call:
## lm(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip +
## opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops +
## davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops +
## guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops +
## maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
## smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9750 -1.0246 -0.0079 1.0842 6.2977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.04572 2.19414 -0.477 0.634397
## game_park_factor 1.64124 1.52906 1.073 0.284968
## tor_home 0.06331 0.30299 0.209 0.834797
## opp_pitcher_whip 0.61667 0.24050 2.564 0.011406 *
## opp_team_der -3.55534 1.90159 -1.870 0.063634 .
## drurb001_ops 1.05968 0.26672 3.973 0.000114 ***
## bichb001_ops 0.97259 0.33756 2.881 0.004590 **
## biggc002_ops 0.96562 0.21894 4.411 2.05e-05 ***
## davij007_ops 1.37884 0.56310 2.449 0.015585 *
## fishd001_ops 0.65759 0.45278 1.452 0.148661
## galvf001_ops 0.69456 0.23043 3.014 0.003063 **
## gricr001_ops 0.85121 0.22450 3.792 0.000222 ***
## guerv002_ops 1.04550 0.23294 4.488 1.49e-05 ***
## hernt002_ops 0.97801 0.17225 5.678 7.66e-08 ***
## gurrl001_ops 1.08369 0.24157 4.486 1.51e-05 ***
## jansd001_ops 1.03540 0.26243 3.945 0.000126 ***
## maill001_ops 1.43475 0.48841 2.938 0.003874 **
## mcgur002_ops 0.29216 0.35494 0.823 0.411848
## mckib001_ops 1.15146 0.29050 3.964 0.000118 ***
## pillk001_ops 7.22145 3.56851 2.024 0.044922 *
## smoaj001_ops 0.36627 0.23319 1.571 0.118533
## sogae001_ops 0.89263 0.26014 3.431 0.000791 ***
## tellr001_ops 0.98589 0.23834 4.136 6.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.721 on 139 degrees of freedom
## Multiple R-squared: 0.7555, Adjusted R-squared: 0.7168
## F-statistic: 19.53 on 22 and 139 DF, p-value: < 2.2e-16
### Linear regression - in-sample evaluation on the training set
linreg_predict_training <- predict(model_linreg, tor_games_training)
linreg_actuals_preds_training <- data.frame(
  actuals = tor_games_training$tor_runs,
  predicteds = linreg_predict_training
)
# NOTE(review): method = "spearman" is a rank correlation, reported as "r".
correlation_accuracy_training <- cor(linreg_actuals_preds_training$actuals,
                                     y = linreg_actuals_preds_training$predicteds,
                                     method = "spearman")
# head(linreg_actuals_preds_training)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_training
## [1] 0.8729085
linreg_preds_errors <- linreg_actuals_preds_training$actuals - linreg_actuals_preds_training$predicteds
linreg_mspe <- mean(linreg_preds_errors ^ 2)
print("Mean squared prediction error")
## [1] "Mean squared prediction error"
linreg_mspe
## [1] 2.541933
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk on the residuals: p > 0.05 means normality is not rejected.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Normal"
models_compare_training <- rbind(
  models_compare_training,
  data.frame(Model = "Linear regression",
             Correlation_r = correlation_accuracy_training,
             Mean_squared_prediction_error = linreg_mspe,
             Error_distribution = linreg_preds_errors_dist)
)
# The training result is also appended to the test-set table as a reference row.
models_compare_test <- rbind(
  models_compare_test,
  data.frame(Model = "Linear regression (Training)",
             Correlation_r = correlation_accuracy_training,
             Mean_squared_prediction_error = linreg_mspe,
             Error_distribution = linreg_preds_errors_dist)
)
# BUG FIX: abline() interprets the lm fit as y = a + b*x on the plot's axes,
# so the formula must be (y-axis ~ x-axis) = predicteds ~ actuals. The
# original regressed actuals on predicteds and drew the wrong line (the
# analogous plots later in the file use the correct orientation). Also
# repaired the mojibake "Å·" -> "ŷ" in the axis label.
plot(linreg_actuals_preds_training$actuals, linreg_actuals_preds_training$predicteds,
     main = "Plot of runs/game - Linear regression model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(linreg_actuals_preds_training$predicteds ~ linreg_actuals_preds_training$actuals),
       col = "red")
library(ggplot2)
# Overlay the distribution of actual runs with the linear model's fitted
# values on the training set.
actuals <- data.frame(Runs = tor_games_training$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = linreg_predict_training, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nLinear regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(dplyr)
### k-NN regression
# Separate the response from the predictors (k-NN needs them apart).
knn_tor_games_actuals <- tor_games_training["tor_runs"]
knn_tor_games_training <- tor_games_training[, setdiff(names(tor_games_training), "tor_runs")]
# Fix the RNG state so the 75/25 split is reproducible.
set.seed(1234)
smp_size <- floor(0.75 * nrow(knn_tor_games_training))
train_ind <- sample(seq_len(nrow(knn_tor_games_training)), size = smp_size)
# Partition predictors and response with the same sampled index.
knn_train <- knn_tor_games_training[train_ind, ]
knn_test <- knn_tor_games_training[-train_ind, ]
knn_train_actuals <- knn_tor_games_actuals[train_ind, ]
knn_test_actuals <- knn_tor_games_actuals[-train_ind, ]
library(FNN)
# k = 11 nearest neighbours; predictions are the mean response of the
# neighbours found in the 75% training partition.
model_knn <- knn.reg(knn_train, knn_test, knn_train_actuals, k = 11)
model_knn
## Prediction:
## [1] 0.5454545 1.8181818 4.8181818 2.6363636 4.0000000 1.3636364 1.8181818
## [8] 3.6363636 2.2727273 1.6363636 2.3636364 2.5454545 1.0909091 3.3636364
## [15] 7.3636364 3.0000000 1.3636364 2.5454545 1.1818182 3.6363636 4.4545455
## [22] 3.7272727 3.2727273 2.8181818 6.0909091 3.0909091 2.6363636 5.5454545
## [29] 3.1818182 4.6363636 3.3636364 0.5454545 3.4545455 4.8181818 3.0909091
## [36] 0.5454545 3.6363636 4.6363636 3.9090909 3.0909091 1.1818182
# Spearman rank correlation on the held-out 25% of the training data.
knn_cor <- cor(knn_test_actuals, y = model_knn$pred, method = "spearman")
# head(model_knn$pred)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
knn_cor
## [1] 0.6882425
knn_preds_errors <- knn_test_actuals - model_knn$pred
knn_mspe <- mean(knn_preds_errors ^ 2)
print("Mean squared prediction error")
## [1] "Mean squared prediction error"
knn_mspe
## [1] 9.338037
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk on the residuals: p > 0.05 means normality is not rejected.
sw <- shapiro.test(knn_preds_errors)
knn_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
knn_preds_errors_dist
## [1] "Normal"
models_compare_training <- rbind(
  models_compare_training,
  data.frame(Model = "k-NN regression",
             Correlation_r = knn_cor,
             Mean_squared_prediction_error = knn_mspe,
             Error_distribution = knn_preds_errors_dist)
)
# FIX: repaired the mojibake "Å·" -> "ŷ" in the axis label.
plot(knn_test_actuals, model_knn$pred,
     main = "Plot of runs/game - k-NN regression model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(model_knn$pred ~ knn_test_actuals), col = "red")
library(ggplot2)
# Overlay the distribution of actual runs with the k-NN predictions on the
# held-out split.
actuals <- data.frame(Runs = knn_test_actuals, Dataset = "Actual")
predicteds <- data.frame(Runs = model_knn$pred, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nk-NN regression model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(rpart)
# ANOVA (regression) tree on the same 22 features as the linear model.
model_regtree <- rpart(
  tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip + opp_team_der +
    drurb001_ops + bichb001_ops + biggc002_ops + davij007_ops + fishd001_ops +
    galvf001_ops + gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
    jansd001_ops + maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
    smoaj001_ops + sogae001_ops + tellr001_ops,
  method = "anova",
  data = tor_games_training
)
summary(model_regtree)
## Call:
## rpart(formula = tor_runs ~ game_park_factor + tor_home + opp_pitcher_whip +
## opp_team_der + drurb001_ops + bichb001_ops + biggc002_ops +
## davij007_ops + fishd001_ops + galvf001_ops + gricr001_ops +
## guerv002_ops + hernt002_ops + gurrl001_ops + jansd001_ops +
## maill001_ops + mcgur002_ops + mckib001_ops + pillk001_ops +
## smoaj001_ops + sogae001_ops + tellr001_ops, data = tor_games_training,
## method = "anova")
## n= 162
##
## CP nsplit rel error xerror xstd
## 1 0.35721276 0 1.0000000 1.0106205 0.14958481
## 2 0.10122683 1 0.6427872 0.7660027 0.12160672
## 3 0.07836614 2 0.5415604 0.6681215 0.11442280
## 4 0.03420656 3 0.4631943 0.6369548 0.09780703
## 5 0.02999152 4 0.4289877 0.6772146 0.10215383
## 6 0.02440076 5 0.3989962 0.6725543 0.09473910
## 7 0.02103845 6 0.3745954 0.6755489 0.09443159
## 8 0.01662439 7 0.3535570 0.6550914 0.09184229
## 9 0.01175462 8 0.3369326 0.6466864 0.08809536
## 10 0.01077065 9 0.3251780 0.6505809 0.08933213
## 11 0.01000000 10 0.3144073 0.6520036 0.08929205
##
## Variable importance
## opp_pitcher_whip opp_team_der biggc002_ops guerv002_ops
## 36 22 10 5
## smoaj001_ops gricr001_ops drurb001_ops fishd001_ops
## 4 4 3 3
## mcgur002_ops jansd001_ops galvf001_ops game_park_factor
## 3 3 1 1
## tor_home gurrl001_ops sogae001_ops tellr001_ops
## 1 1 1 1
## bichb001_ops
## 1
##
## Node number 1: 162 observations, complexity param=0.3572128
## mean=4.481481, MSE=10.39781
## left son=2 (113 obs) right son=3 (49 obs)
## Primary splits:
## opp_pitcher_whip < 1.55 to the left, improve=0.3572128, (0 missing)
## opp_team_der < 0.7245 to the right, improve=0.2791761, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.2106361, (0 missing)
## biggc002_ops < 0.55 to the left, improve=0.1808705, (0 missing)
## hernt002_ops < 0.775 to the left, improve=0.1527696, (0 missing)
## Surrogate splits:
## opp_team_der < 0.587 to the right, agree=0.747, adj=0.163, (0 split)
## drurb001_ops < 1.325 to the left, agree=0.728, adj=0.102, (0 split)
## biggc002_ops < 1.8165 to the left, agree=0.728, adj=0.102, (0 split)
## smoaj001_ops < 1.883 to the left, agree=0.728, adj=0.102, (0 split)
## jansd001_ops < 1.775 to the left, agree=0.722, adj=0.082, (0 split)
##
## Node number 2: 113 observations, complexity param=0.1012268
## mean=3.212389, MSE=5.671705
## left son=4 (71 obs) right son=5 (42 obs)
## Primary splits:
## opp_team_der < 0.7235 to the right, improve=0.2660482, (0 missing)
## biggc002_ops < 1.3335 to the left, improve=0.2056460, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.1748234, (0 missing)
## opp_pitcher_whip < 1.134 to the left, improve=0.1571339, (0 missing)
## gricr001_ops < 0.7915 to the left, improve=0.1481078, (0 missing)
## Surrogate splits:
## biggc002_ops < 0.775 to the left, agree=0.735, adj=0.286, (0 split)
## opp_pitcher_whip < 1.0625 to the left, agree=0.699, adj=0.190, (0 split)
## guerv002_ops < 1.05 to the left, agree=0.681, adj=0.143, (0 split)
## mcgur002_ops < 0.25 to the left, agree=0.673, adj=0.119, (0 split)
## gricr001_ops < 0.55 to the left, agree=0.664, adj=0.095, (0 split)
##
## Node number 3: 49 observations, complexity param=0.07836614
## mean=7.408163, MSE=9.017076
## left son=6 (42 obs) right son=7 (7 obs)
## Primary splits:
## opp_team_der < 0.558 to the right, improve=0.2987606, (0 missing)
## hernt002_ops < 0.8165 to the left, improve=0.2228298, (0 missing)
## guerv002_ops < 1.0165 to the left, improve=0.2214362, (0 missing)
## gricr001_ops < 1.256 to the left, improve=0.1493609, (0 missing)
## opp_pitcher_whip < 2.9635 to the left, improve=0.1382294, (0 missing)
##
## Node number 4: 71 observations, complexity param=0.02440076
## mean=2.267606, MSE=3.041063
## left son=8 (52 obs) right son=9 (19 obs)
## Primary splits:
## gricr001_ops < 0.7915 to the left, improve=0.1903603, (0 missing)
## opp_pitcher_whip < 0.845 to the left, improve=0.1667133, (0 missing)
## hernt002_ops < 1.125 to the left, improve=0.1464811, (0 missing)
## jansd001_ops < 0.3665 to the left, improve=0.1226668, (0 missing)
## opp_team_der < 0.8975 to the right, improve=0.0766997, (0 missing)
## Surrogate splits:
## opp_pitcher_whip < 1.21 to the left, agree=0.845, adj=0.421, (0 split)
## sogae001_ops < 1.4585 to the left, agree=0.803, adj=0.263, (0 split)
## game_park_factor < 0.9 to the right, agree=0.746, adj=0.053, (0 split)
## bichb001_ops < 0.125 to the left, agree=0.746, adj=0.053, (0 split)
##
## Node number 5: 42 observations, complexity param=0.03420656
## mean=4.809524, MSE=6.058957
## left son=10 (35 obs) right son=11 (7 obs)
## Primary splits:
## biggc002_ops < 1.3335 to the left, improve=0.2264222, (0 missing)
## game_park_factor < 1.047 to the left, improve=0.2253941, (0 missing)
## guerv002_ops < 1.05 to the left, improve=0.2077189, (0 missing)
## opp_team_der < 0.613 to the right, improve=0.1383982, (0 missing)
## galvf001_ops < 1.175 to the left, improve=0.1197605, (0 missing)
##
## Node number 6: 42 observations, complexity param=0.02999152
## mean=6.738095, MSE=5.431406
## left son=12 (7 obs) right son=13 (35 obs)
## Primary splits:
## fishd001_ops < 0.125 to the right, improve=0.2214591, (0 missing)
## guerv002_ops < 1.3 to the left, improve=0.2214591, (0 missing)
## mcgur002_ops < 0.45 to the right, improve=0.1964096, (0 missing)
## bichb001_ops < 0.2 to the right, improve=0.1344972, (0 missing)
## gricr001_ops < 1.256 to the left, improve=0.1237658, (0 missing)
## Surrogate splits:
## mcgur002_ops < 0.45 to the right, agree=0.952, adj=0.714, (0 split)
## bichb001_ops < 0.2 to the right, agree=0.857, adj=0.143, (0 split)
## biggc002_ops < 1.919 to the right, agree=0.857, adj=0.143, (0 split)
##
## Node number 7: 7 observations
## mean=11.42857, MSE=11.67347
##
## Node number 8: 52 observations
## mean=1.807692, MSE=2.039941
##
## Node number 9: 19 observations
## mean=3.526316, MSE=3.617729
##
## Node number 10: 35 observations, complexity param=0.01662439
## mean=4.285714, MSE=4.37551
## left son=20 (25 obs) right son=21 (10 obs)
## Primary splits:
## guerv002_ops < 0.825 to the left, improve=0.18285450, (0 missing)
## gricr001_ops < 1.3415 to the left, improve=0.16791040, (0 missing)
## game_park_factor < 1.0245 to the left, improve=0.15236320, (0 missing)
## sogae001_ops < 0.883 to the left, improve=0.14109140, (0 missing)
## hernt002_ops < 0.9165 to the left, improve=0.09444963, (0 missing)
## Surrogate splits:
## fishd001_ops < 0.125 to the left, agree=0.800, adj=0.3, (0 split)
## game_park_factor < 1.047 to the left, agree=0.771, adj=0.2, (0 split)
## opp_team_der < 0.568 to the right, agree=0.771, adj=0.2, (0 split)
## galvf001_ops < 1.175 to the left, agree=0.771, adj=0.2, (0 split)
## gricr001_ops < 0.125 to the right, agree=0.771, adj=0.2, (0 split)
##
## Node number 11: 7 observations
## mean=7.428571, MSE=6.244898
##
## Node number 12: 7 observations
## mean=4.285714, MSE=0.7755102
##
## Node number 13: 35 observations, complexity param=0.02103845
## mean=7.228571, MSE=4.919184
## left son=26 (15 obs) right son=27 (20 obs)
## Primary splits:
## guerv002_ops < 0.3 to the left, improve=0.20583030, (0 missing)
## gricr001_ops < 0.55 to the left, improve=0.12809200, (0 missing)
## hernt002_ops < 1.1 to the left, improve=0.12391030, (0 missing)
## biggc002_ops < 0.95 to the left, improve=0.11745480, (0 missing)
## smoaj001_ops < 0.225 to the right, improve=0.05837042, (0 missing)
## Surrogate splits:
## smoaj001_ops < 0.625 to the right, agree=0.714, adj=0.333, (0 split)
## tellr001_ops < 0.325 to the right, agree=0.686, adj=0.267, (0 split)
## opp_team_der < 0.712 to the right, agree=0.657, adj=0.200, (0 split)
## galvf001_ops < 0.9 to the right, agree=0.657, adj=0.200, (0 split)
## gricr001_ops < 0.1 to the left, agree=0.657, adj=0.200, (0 split)
##
## Node number 20: 25 observations, complexity param=0.01077065
## mean=3.72, MSE=3.6416
## left son=40 (12 obs) right son=41 (13 obs)
## Primary splits:
## game_park_factor < 1.0245 to the left, improve=0.1992812, (0 missing)
## guerv002_ops < 0.2 to the right, improve=0.1549448, (0 missing)
## tor_home < 0.5 to the left, improve=0.1470140, (0 missing)
## gricr001_ops < 0.7915 to the left, improve=0.1230257, (0 missing)
## biggc002_ops < 0.45 to the right, improve=0.1080146, (0 missing)
## Surrogate splits:
## tor_home < 0.5 to the left, agree=0.92, adj=0.833, (0 split)
## guerv002_ops < 0.45 to the right, agree=0.68, adj=0.333, (0 split)
## biggc002_ops < 0.325 to the right, agree=0.64, adj=0.250, (0 split)
## davij007_ops < 0.4585 to the right, agree=0.64, adj=0.250, (0 split)
## galvf001_ops < 0.125 to the right, agree=0.64, adj=0.250, (0 split)
##
## Node number 21: 10 observations
## mean=5.7, MSE=3.41
##
## Node number 26: 15 observations
## mean=6.066667, MSE=1.262222
##
## Node number 27: 20 observations, complexity param=0.01175462
## mean=8.1, MSE=5.89
## left son=54 (9 obs) right son=55 (11 obs)
## Primary splits:
## opp_pitcher_whip < 2.155 to the right, improve=0.1680815, (0 missing)
## hernt002_ops < 0.9 to the left, improve=0.1613650, (0 missing)
## gricr001_ops < 0.55 to the left, improve=0.1369553, (0 missing)
## mckib001_ops < 0.225 to the left, improve=0.1189162, (0 missing)
## jansd001_ops < 0.575 to the left, improve=0.0994235, (0 missing)
## Surrogate splits:
## gurrl001_ops < 0.8165 to the left, agree=0.80, adj=0.556, (0 split)
## galvf001_ops < 0.775 to the left, agree=0.75, adj=0.444, (0 split)
## hernt002_ops < 0.343 to the left, agree=0.70, adj=0.333, (0 split)
## jansd001_ops < 0.575 to the left, agree=0.70, adj=0.333, (0 split)
## mckib001_ops < 0.525 to the right, agree=0.70, adj=0.333, (0 split)
##
## Node number 40: 12 observations
## mean=2.833333, MSE=1.805556
##
## Node number 41: 13 observations
## mean=4.538462, MSE=3.940828
##
## Node number 54: 9 observations
## mean=7, MSE=5.333333
##
## Node number 55: 11 observations
## mean=9, MSE=4.545455
library(rpart.plot)
library(rattle)
# Visualise the fitted tree.
fancyRpartPlot(model_regtree, main = "Regression tree model")
# In-sample predictions (leaf means) for every training game.
regtree_preds <- predict(model_regtree, type = "vector")
# head(regtree_preds)
regtree_cor <- cor(tor_games_training$tor_runs, y = regtree_preds, method = "spearman")
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
regtree_cor
## [1] 0.813598
regtree_preds_errors <- tor_games_training$tor_runs - regtree_preds
regtree_mspe <- mean(regtree_preds_errors ^ 2)
print("Mean squared prediction error")
## [1] "Mean squared prediction error"
regtree_mspe
## [1] 3.269146
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk on the residuals: p > 0.05 means normality is not rejected.
sw <- shapiro.test(regtree_preds_errors)
regtree_preds_errors_dist <- if (sw$p.value > 0.05) "Normal" else "Not normal"
regtree_preds_errors_dist
## [1] "Not normal"
models_compare_training <- rbind(
  models_compare_training,
  data.frame(Model = "Regression tree",
             Correlation_r = regtree_cor,
             Mean_squared_prediction_error = regtree_mspe,
             Error_distribution = regtree_preds_errors_dist)
)
# jitter() spreads the discrete leaf-mean predictions so overlapping points
# are visible. FIX: repaired the mojibake "Å·" -> "ŷ" in the axis label.
plot(tor_games_training$tor_runs, jitter(regtree_preds, 2),
     main = "Plot of runs/game - Regression tree model - Training data",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
abline(lm(regtree_preds ~ tor_games_training$tor_runs), col = "red")
library(ggplot2)
# Overlay the distribution of actual runs with the regression tree's fitted
# values on the training set.
actuals <- data.frame(Runs = tor_games_training$tor_runs, Dataset = "Actual")
predicteds <- data.frame(Runs = regtree_preds, Dataset = "Predicted")
runs <- rbind(actuals, predicteds)
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nRegression tree model - Actual vs Predicted",
       x = "Runs", y = "Games") +
  theme_classic()
library(knitr)
library(kableExtra)
# Render the training-set model comparison as a styled table.
kable_styling(kable(models_compare_training))
| Model | Correlation_r | Mean_squared_prediction_error | Error_distribution |
|---|---|---|---|
| Linear regression | 0.8729085 | 2.541933 | Normal |
| k-NN regression | 0.6882425 | 9.338037 | Normal |
| Regression tree | 0.8135980 | 3.269146 | Not normal |
### Evaluate the baseline (all-feature) linear model on the held-out test set.
linreg_predict_test <- predict(model_linreg, tor_games_test)
# Pair each test game's actual runs with the model's prediction.
linreg_actuals_preds_test <- data.frame(cbind(actuals=tor_games_test$tor_runs, predicteds=linreg_predict_test))
# Spearman rank correlation between actual and predicted runs.
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals, y=linreg_actuals_preds_test$predicteds, method=c("spearman"))
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## [1] 0.23505
# Per-game prediction errors and mean squared prediction error (MSPE).
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
linreg_mspe = mean((linreg_preds_errors) ^ 2)
print("Mean squared prediction error")
## [1] "Mean squared prediction error"
linreg_mspe
## [1] 10.16653
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test on the errors: p > 0.05 means normality is not rejected.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if(sw$p.value>0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
# Append the baseline model's test-set metrics to the comparison table.
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Test - baseline)","Correlation_r"=correlation_accuracy_test,"Mean_squared_prediction_error"=linreg_mspe,"Error_distribution"=linreg_preds_errors_dist))
## Baseline linear model: predicted vs actual runs on the test data.
## FIX: ylab mojibake "Å·" repaired to the intended "ŷ".
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds,
     main = "Plot of runs/game - Linear regression model - Test data - baseline",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
## FIX: regress predicted on actual (y ~ x) so the overlaid line matches the
## plot's axes; the original used lm(actuals ~ predicteds), which draws the
## inverse regression line on these axes (the training plot at L430 uses the
## correct y ~ x orientation).
abline(lm(linreg_actuals_preds_test$predicteds ~ linreg_actuals_preds_test$actuals), col = "red")
## Long-format frame of actual vs predicted runs for the histogram.
actuals <- data.frame(tor_games_test$tor_runs)
actuals$dataset <- 'Actual'
names(actuals) <- c('Runs', 'Dataset')
predicteds <- data.frame(linreg_predict_test)
predicteds$dataset <- 'Predicted'
names(predicteds) <- c('Runs', 'Dataset')
runs <- rbind(actuals, predicteds)
## Dodged histogram of actual vs predicted runs/game, baseline model.
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActuals vs Test Set results - baseline",
       x = "Runs", y = "Games") +
  theme_classic()
library(Boruta)
### Boruta wrapper feature selection: compares each predictor's importance
### against shadow (shuffled) copies across up to 500 random-forest runs and
### labels each attribute Confirmed / Rejected / Tentative.
boruta_output <- Boruta(tor_runs~., data=tor_games_training, doTrace=0, maxRuns=500)
boruta_output
## Boruta performed 499 iterations in 14.68618 secs.
## 11 attributes confirmed important: biggc002_ops, drurb001_ops,
## game_park_factor, gricr001_ops, guerv002_ops and 6 more;
## 10 attributes confirmed unimportant: bichb001_ops, davij007_ops,
## fishd001_ops, galvf001_ops, maill001_ops and 5 more;
## 1 tentative attributes left: sogae001_ops;
# Importance box plot; las=2 rotates the attribute labels for readability.
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Per-attribute importance statistics and the final Boruta decision.
boruta_table <- attStats(boruta_output)
boruta_table
### remove outlier, one game where runs > 13
# NOTE(review): tor_games_test is filtered here AFTER the baseline
# predictions (linreg_predict_test) were generated on the unfiltered test
# set. Reusing that stale vector downstream produces the "number of rows
# of result is not a multiple of vector length" recycling warning —
# predictions must be regenerated after this filter.
tor_games_training <- tor_games_training %>% filter(tor_runs < 13)
tor_games_test <- tor_games_test %>% filter(tor_runs < 13)
### Final linear model: opponent pitching (WHIP), opponent defense (DER),
### and the OPS of the batters Boruta confirmed as important.
model_linreg_final <- lm(tor_runs~opp_pitcher_whip+opp_team_der+biggc002_ops+gricr001_ops+guerv002_ops+hernt002_ops+gurrl001_ops+jansd001_ops, tor_games_training)
summary(model_linreg_final)
##
## Call:
## lm(formula = tor_runs ~ opp_pitcher_whip + opp_team_der + biggc002_ops +
## gricr001_ops + guerv002_ops + hernt002_ops + gurrl001_ops +
## jansd001_ops, data = tor_games_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3417 -1.2306 -0.4019 1.2832 5.3150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9452 1.5713 3.784 0.000222 ***
## opp_pitcher_whip 1.2809 0.2283 5.611 9.28e-08 ***
## opp_team_der -7.0450 1.8123 -3.887 0.000151 ***
## biggc002_ops 0.6065 0.2167 2.799 0.005789 **
## gricr001_ops 0.5769 0.2291 2.518 0.012837 *
## guerv002_ops 0.6686 0.2367 2.825 0.005368 **
## hernt002_ops 0.6735 0.1789 3.764 0.000238 ***
## gurrl001_ops 0.8806 0.2351 3.746 0.000254 ***
## jansd001_ops 0.5427 0.2644 2.052 0.041872 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.887 on 152 degrees of freedom
## Multiple R-squared: 0.6325, Adjusted R-squared: 0.6132
## F-statistic: 32.7 on 8 and 152 DF, p-value: < 2.2e-16
### Evaluate the FINAL linear model on the (outlier-filtered) test set.
linreg_predict_final <- predict(model_linreg_final, tor_games_test)
## BUG FIX: build the evaluation frame from the final model's predictions.
## The original used the stale baseline vector `linreg_predict_test`
## (computed on the unfiltered test set), which (a) scored the wrong model
## and (b) caused the "number of rows of result is not a multiple of vector
## length" recycling warning once the outlier filter shrank tor_games_test.
## Using data.frame() directly (not data.frame(cbind(...))) also makes any
## future length mismatch an error instead of silent recycling.
linreg_actuals_preds_test <- data.frame(actuals=tor_games_test$tor_runs, predicteds=linreg_predict_final)
# Spearman rank correlation between actual and predicted runs.
correlation_accuracy_test <- cor(linreg_actuals_preds_test$actuals, y=linreg_actuals_preds_test$predicteds, method=c("spearman"))
# head(linreg_actuals_preds_test)
print("Correlation (r) - Predicted vs Actual")
## [1] "Correlation (r) - Predicted vs Actual"
correlation_accuracy_test
## NOTE(review): the cached outputs below (r = 0.1481678, MSPE = 9.28426)
## were produced by the stale baseline predictions — re-knit to regenerate.
## [1] 0.1481678
# Per-game prediction errors and mean squared prediction error (MSPE).
linreg_preds_errors <- (linreg_actuals_preds_test$actuals - linreg_actuals_preds_test$predicteds)
linreg_mspe = mean((linreg_preds_errors) ^ 2)
print("Mean squared prediction error")
## [1] "Mean squared prediction error"
linreg_mspe
## [1] 9.28426
print("Error distribution")
## [1] "Error distribution"
# Shapiro-Wilk test on the errors: p > 0.05 means normality is not rejected.
sw <- shapiro.test(linreg_preds_errors)
linreg_preds_errors_dist <- if(sw$p.value>0.05) "Normal" else "Not normal"
linreg_preds_errors_dist
## [1] "Not normal"
# Append the final model's test-set metrics to the comparison table.
models_compare_test <- rbind(models_compare_test, data.frame("Model"="Linear regression (Test - final)","Correlation_r"=correlation_accuracy_test,"Mean_squared_prediction_error"=linreg_mspe,"Error_distribution"=linreg_preds_errors_dist))
## Final linear model: predicted vs actual runs on the test data.
## FIX: ylab mojibake "Å·" repaired to the intended "ŷ".
plot(linreg_actuals_preds_test$actuals, linreg_actuals_preds_test$predicteds,
     main = "Plot of runs/game - Linear regression model - Test data - final",
     xlab = "Actual (y)", ylab = "Predicted (ŷ)")
## FIX: regress predicted on actual (y ~ x) so the overlaid line matches the
## plot's axes; the original lm(actuals ~ predicteds) draws the inverse
## regression line on these axes.
abline(lm(linreg_actuals_preds_test$predicteds ~ linreg_actuals_preds_test$actuals), col = "red")
## Long-format frame of actual vs final-model predicted runs.
actuals <- data.frame(tor_games_test$tor_runs)
actuals$dataset <- 'Actual'
names(actuals) <- c('Runs', 'Dataset')
predicteds <- data.frame(linreg_predict_final)
predicteds$dataset <- 'Predicted'
names(predicteds) <- c('Runs', 'Dataset')
runs <- rbind(actuals, predicteds)
## Dodged histogram of actual vs predicted runs/game, final model.
ggplot(runs, aes(x = Runs, y = ..count.., color = Dataset, fill = Dataset)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  geom_density(alpha = 0) +
  scale_fill_manual(values = c("darkblue", "lightblue")) +
  scale_color_manual(values = c("darkblue", "lightblue")) +
  labs(title = "Histogram of Runs/Game by Toronto Blue Jays (2019 Season)\nActuals vs Test Set results - revised model",
       x = "Runs", y = "Games") +
  theme_classic()
library(knitr)
library(kableExtra)
# Render the test-set model-comparison table as styled HTML output.
kable_styling(kable(models_compare_test))
| Model | Correlation_r | Mean_squared_prediction_error | Error_distribution |
|---|---|---|---|
| Linear regression (Training) | 0.8729085 | 2.541933 | Normal |
| Linear regression (Test - baseline) | 0.2350500 | 10.166530 | Not normal |
| Linear regression (Test - final) | 0.1481678 | 9.284260 | Not normal |